########################################################################################################################################################
# May 20 2021
# Replication File
# Rebecca Cordell, K. Chad Clay, Christopher J. Fariss, Reed M. Wood and Thorin M. Wright
# Disaggregating Repression: Identifying Physical Integrity Rights Allegations in Human Rights Reports
# International Studies Quarterly
# https://academic.oup.com/isq 
########################################################################################################################################################

# Clear work environment
rm(list=ls())

# Required packages
library(rworldmap)  
library(RColorBrewer)
library(dplyr)
library(data.table)
library(stm)
library(countrycode)
library(tm)
library(readxl)
library(caret)
library(klaR)
library(e1071)
library(ngram)
library(PRROC)
library(countrycode)
library(stm)
library(tm)
library(MASS)
library(ggplot2)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggplot2)
library(dplyr)
library(sf)
library(raster)
library(ggspatial)
library(mapproj)

# Read in SNARP allegation data
snarp_allegation_data<-read.csv("snarp_allegation_data.csv", header=TRUE, stringsAsFactors=FALSE)

########################
# Table 1: Training data
########################

# Subset training data
train<-snarp_allegation_data[!is.na(snarp_allegation_data$train.target),] 
train_allegs<-train[train$train.target==1,]
train_countries<-train_allegs[train_allegs$iso3c=="AGO" |
                           train_allegs$iso3c=="BLR" |
                           train_allegs$iso3c=="MEX" |
                           train_allegs$iso3c=="NGA" |
                           train_allegs$iso3c=="PHL" |
                           train_allegs$iso3c=="GBR",]
train_2012<-train_allegs[train_allegs$year==2012 &
                            !(train_allegs$iso3c=="AGO" |
                           train_allegs$iso3c=="BLR" |
                           train_allegs$iso3c=="MEX" |
                           train_allegs$iso3c=="NGA" |
                           train_allegs$iso3c=="PHL" |
                           train_allegs$iso3c=="GBR"),]
train_random<-train_allegs[train_allegs$year!=2012 &
                              !(train_allegs$iso3c=="AGO" |
                                train_allegs$iso3c=="BLR" |
                                train_allegs$iso3c=="MEX" |
                                train_allegs$iso3c=="NGA" |
                                train_allegs$iso3c=="PHL" |
                                train_allegs$iso3c=="GBR"),]

# Calculate number of allegation sentences
train_countries_n<-train_countries %>% count(country)
train_2012_n<-as.factor(nrow(train_2012))
train_random_n<-nrow(train_random)

# Calculate coverage by years
train_countries_year<-train_countries %>%
  group_by(country) %>%
  summarize(min = min(year),
            max = max(year))
train_countries_year$year <- paste(train_countries_year$min,"-",train_countries_year$max)
train_2012_year<-train_2012 %>%
  summarize(year = unique(year))
train_2012_year$year<-as.factor(train_2012_year$year)
train_random_year<-train_random %>%
  summarize(min = min(year),
            max = max(year))
train_random_year$year <- paste(train_random_year$min,"-",train_random_year$max)

# Create Table 1
train_2012_country<-"All countries"
train_random_country<-"Random sample"
train_countries_table1<-as.data.frame(cbind(train_countries_year$country, train_countries_year$year, train_countries_n$n))
train_2012_table1<-as.data.frame(cbind(train_2012_country,train_2012_year, train_2012_n))
train_random_table1<-as.data.frame(cbind(train_random_country,train_random_year$year, train_random_n))
setnames(train_countries_table1, old = c('V1','V2', 'V3'), new = c('country','year', 'number of allegation sentences'))
setnames(train_2012_table1, old = c('train_2012_country','train_2012_n'), new = c('country','number of allegation sentences'))
setnames(train_random_table1, old = c('train_random_country','V2', 'train_random_n'), new = c('country','year','number of allegation sentences'))
table1<-rbind(train_countries_table1,train_2012_table1,train_random_table1)
table1

############################################################################
# Table 2: In sample model accuracy with 5x repeated k-fold cross validation
############################################################################

# Read in training data document term matrix
train<-read.csv("training_data_dtm.csv", header=TRUE, stringsAsFactors=FALSE)

# Subset 20% of the training data not used for training
train_test<-train[train$train.test.dta==1,]

# Calculate the accuracy,	precision,	recall,	F1 score,	AUC ROC, AUC PR using the support vector machine model
# Create confusion matrix
confusion_matrix_svm_table2<-table(train_test$svm.pred,train_test$train.target)
## Calculate true positive and true negative rate
TP_svm_table2 <- confusion_matrix_svm_table2[2,2]
TN_svm_table2 <- confusion_matrix_svm_table2[1,1]
## Calculate false positive and true negative rate
FP_svm_table2 <- confusion_matrix_svm_table2[1,2]
FN_svm_table2 <- confusion_matrix_svm_table2[2,1]
## Calculate accuracy
accuracy_svm_table2 <- (TP_svm_table2 + TN_svm_table2) / (TP_svm_table2 + TN_svm_table2 + FP_svm_table2 + FN_svm_table2)
## Calculate false negative rate
false_negative_rate_svm_table2 <- FN_svm_table2 / (TP_svm_table2 + TN_svm_table2 + FP_svm_table2 + FN_svm_table2)
## Calculate false positive rate
false_positive_rate_svm_table2 <- FP_svm_table2 / (TP_svm_table2 + TN_svm_table2 + FP_svm_table2 + FN_svm_table2)
## Calculate precision
precision_svm_table2 <- (TP_svm_table2) / (TP_svm_table2 + FP_svm_table2)
## Calculate recall
recall_svm_table2 <- (TP_svm_table2) / (TP_svm_table2 + FN_svm_table2)
# Calculate F1 score
F1_svm_table2 <- (2 * precision_svm_table2 * recall_svm_table2) / (precision_svm_table2 + recall_svm_table2)
# Calculate AUC ROC and AUC PR 	
fg_svm_table2 <- train_test$svm.pred[train_test$train.target == 1]
bg_svm_table2 <- train_test$svm.pred[train_test$train.target == 0]
auc_roc_svm_table2 <- roc.curve(scores.class0 = fg_svm_table2, scores.class1 = bg_svm_table2, curve = T)
auc_pr_svm_table2 <- pr.curve(scores.class0 = fg_svm_table2, scores.class1 = bg_svm_table2, curve = T)

# Calculate the accuracy,	precision,	recall,	F1 score,	AUC ROC, AUC PR using the naive bayes model
# Create confusion matrix
confusion_matrix_nb_table2<-table(train_test$nb.pred,train_test$train.target)
## Calculate true positive and true negative rate
TP_nb_table2 <- confusion_matrix_nb_table2[2,2]
TN_nb_table2 <- confusion_matrix_nb_table2[1,1]
## Calculate false positive and true negative rate
FP_nb_table2 <- confusion_matrix_nb_table2[1,2]
FN_nb_table2 <- confusion_matrix_nb_table2[2,1]
## Calculate accuracy
accuracy_nb_table2 <- (TP_nb_table2 + TN_nb_table2) / (TP_nb_table2 + TN_nb_table2 + FP_nb_table2 + FN_nb_table2)
## Calculate false negative rate
false_negative_rate_nb_table2 <- FN_nb_table2 / (TP_nb_table2 + TN_nb_table2 + FP_nb_table2 + FN_nb_table2)
## Calculate false positive rate
false_positive_rate_nb_table2 <- FP_nb_table2 / (TP_nb_table2 + TN_nb_table2 + FP_nb_table2 + FN_nb_table2)
## Calculate precision
precision_nb_table2 <- (TP_nb_table2) / (TP_nb_table2 + FP_nb_table2)
## Calculate recall
recall_nb_table2 <- (TP_nb_table2) / (TP_nb_table2 + FN_nb_table2)
# Calculate F1 score
F1_nb_table2 <- (2 * precision_nb_table2 * recall_nb_table2) / (precision_nb_table2 + recall_nb_table2)
# Calculate AUC ROC and AUC PR 	
fg_nb_table2 <- train_test$nb.pred[train_test$train.target == 1]
bg_nb_table2 <- train_test$nb.pred[train_test$train.target == 0]
auc_roc_nb_table2 <- roc.curve(scores.class0 = fg_nb_table2, scores.class1 = bg_nb_table2, curve = T)
auc_pr_nb_table2 <- pr.curve(scores.class0 = fg_nb_table2, scores.class1 = bg_nb_table2, curve = T)

# Calculate the accuracy,	precision,	recall,	F1 score,	AUC ROC, AUC PR using the logistic regression model
# Create confusion matrix
confusion_matrix_logistic_table2<-table(train_test$logistic.pred,train_test$train.target)
## Calculate true positive and true negative rate
TP_logistic_table2 <- confusion_matrix_logistic_table2[2,2]
TN_logistic_table2 <- confusion_matrix_logistic_table2[1,1]
## Calculate false positive and true negative rate
FP_logistic_table2 <- confusion_matrix_logistic_table2[1,2]
FN_logistic_table2 <- confusion_matrix_logistic_table2[2,1]
## Calculate accuracy
accuracy_logistic_table2 <- (TP_logistic_table2 + TN_logistic_table2) / (TP_logistic_table2 + TN_logistic_table2 + FP_logistic_table2 + FN_logistic_table2)
## Calculate false negative rate
false_negative_rate_logistic_table2 <- FN_logistic_table2 / (TP_logistic_table2 + TN_logistic_table2 + FP_logistic_table2 + FN_logistic_table2)
## Calculate false positive rate
false_positive_rate_logistic_table2 <- FP_logistic_table2 / (TP_logistic_table2 + TN_logistic_table2 + FP_logistic_table2 + FN_logistic_table2)
## Calculate precision
precision_logistic_table2 <- (TP_logistic_table2) / (TP_logistic_table2 + FP_logistic_table2)
## Calculate recall
recall_logistic_table2 <- (TP_logistic_table2) / (TP_logistic_table2 + FN_logistic_table2)
# Calculate F1 score
F1_logistic_table2 <- (2 * precision_logistic_table2 * recall_logistic_table2) / (precision_logistic_table2 + recall_logistic_table2)
# Calculate AUC ROC and AUC PR 	
fg_logistic_table2 <- train_test$logistic.pred[train_test$train.target == 1]
bg_logistic_table2 <- train_test$logistic.pred[train_test$train.target == 0]
auc_roc_logistic_table2 <- roc.curve(scores.class0 = fg_logistic_table2, scores.class1 = bg_logistic_table2, curve = T)
auc_pr_logistic_table2 <- pr.curve(scores.class0 = fg_logistic_table2, scores.class1 = bg_logistic_table2, curve = T)

# Calculate the accuracy,	precision,	recall,	F1 score,	AUC ROC, AUC PR using the majority vote model
# Create confusion matrix
confusion_matrix_majority_table2<-table(train_test$majority.pred,train_test$train.target)
## Calculate true positive and true negative rate
TP_majority_table2 <- confusion_matrix_majority_table2[2,2]
TN_majority_table2 <- confusion_matrix_majority_table2[1,1]
## Calculate false positive and true negative rate
FP_majority_table2 <- confusion_matrix_majority_table2[1,2]
FN_majority_table2 <- confusion_matrix_majority_table2[2,1]
## Calculate accuracy
accuracy_majority_table2 <- (TP_majority_table2 + TN_majority_table2) / (TP_majority_table2 + TN_majority_table2 + FP_majority_table2 + FN_majority_table2)
## Calculate false negative rate
false_negative_rate_majority_table2 <- FN_majority_table2 / (TP_majority_table2 + TN_majority_table2 + FP_majority_table2 + FN_majority_table2)
## Calculate false positive rate
false_positive_rate_majority_table2 <- FP_majority_table2 / (TP_majority_table2 + TN_majority_table2 + FP_majority_table2 + FN_majority_table2)
## Calculate precision
precision_majority_table2 <- (TP_majority_table2) / (TP_majority_table2 + FP_majority_table2)
## Calculate recall
recall_majority_table2 <- (TP_majority_table2) / (TP_majority_table2 + FN_majority_table2)
# Calculate F1 score
F1_majority_table2 <- (2 * precision_majority_table2 * recall_majority_table2) / (precision_majority_table2 + recall_majority_table2)
# Calculate AUC ROC and AUC PR 	
fg_majority_table2 <- train_test$majority.pred[train_test$train.target == 1]
bg_majority_table2 <- train_test$majority.pred[train_test$train.target == 0]
auc_roc_majority_table2 <- roc.curve(scores.class0 = fg_majority_table2, scores.class1 = bg_majority_table2, curve = T)
auc_pr_majority_table2 <- pr.curve(scores.class0 = fg_majority_table2, scores.class1 = bg_majority_table2, curve = T)

# Create Table 2
svm_table2<-cbind(accuracy_svm_table2,precision_svm_table2,recall_svm_table2, F1_svm_table2, auc_roc_svm_table2$auc, auc_pr_svm_table2$auc.integral)
nb_table2<-cbind(accuracy_nb_table2,precision_nb_table2,recall_nb_table2, F1_nb_table2, auc_roc_nb_table2$auc, auc_pr_nb_table2$auc.integral)
logistic_table2<-cbind(accuracy_logistic_table2,precision_logistic_table2,recall_logistic_table2, F1_logistic_table2, auc_roc_logistic_table2$auc, auc_pr_logistic_table2$auc.integral)
majority_table2<-cbind(accuracy_majority_table2,precision_majority_table2,recall_majority_table2, F1_majority_table2, auc_roc_majority_table2$auc, auc_pr_majority_table2$auc.integral)
table2<-as.data.frame(rbind(svm_table2,nb_table2,logistic_table2,majority_table2))
colnames(table2) <- c("Accuracy",	"Precision",	"Recall",	"F1 Score", "AUC ROC", "AUC PR")
rownames(table2) <- c("Support Vector Machine", "Naive Bayes", "Logistic Regression", "Majority Vote")
round(table2, 3)

################################################################################
# Table 3: Out of sample model accuracy with 5x repeated k-fold cross validation
################################################################################

# Read in test data document term matrix
test<-read.csv("test_data_dtm.csv", header=TRUE, stringsAsFactors=FALSE)

# Calculate the accuracy,	precision,	recall,	F1 score,	AUC ROC, AUC PR using the support vector machine model
# Create confusion matrix
confusion_matrix_svm_table3<-table(test$svm.pred,test$test.target)
## Calculate true positive and true negative rate
TP_svm_table3 <- confusion_matrix_svm_table3[2,2]
TN_svm_table3 <- confusion_matrix_svm_table3[1,1]
## Calculate false positive and true negative rate
FP_svm_table3 <- confusion_matrix_svm_table3[1,2]
FN_svm_table3 <- confusion_matrix_svm_table3[2,1]
## Calculate accuracy
accuracy_svm_table3 <- (TP_svm_table3 + TN_svm_table3) / (TP_svm_table3 + TN_svm_table3 + FP_svm_table3 + FN_svm_table3)
## Calculate false negative rate
false_negative_rate_svm_table3 <- FN_svm_table3 / (TP_svm_table3 + TN_svm_table3 + FP_svm_table3 + FN_svm_table3)
## Calculate false positive rate
false_positive_rate_svm_table3 <- FP_svm_table3 / (TP_svm_table3 + TN_svm_table3 + FP_svm_table3 + FN_svm_table3)
## Calculate precision
precision_svm_table3 <- (TP_svm_table3) / (TP_svm_table3 + FP_svm_table3)
## Calculate recall
recall_svm_table3 <- (TP_svm_table3) / (TP_svm_table3 + FN_svm_table3)
# Calculate F1 score
F1_svm_table3 <- (2 * precision_svm_table3 * recall_svm_table3) / (precision_svm_table3 + recall_svm_table3)
# Calculate AUC ROC and AUC PR 	
fg_svm_table3 <- test$svm.pred[test$test.target == 1]
bg_svm_table3 <- test$svm.pred[test$test.target == 0]
auc_roc_svm_table3 <- roc.curve(scores.class0 = fg_svm_table3, scores.class1 = bg_svm_table3, curve = T)
auc_pr_svm_table3 <- pr.curve(scores.class0 = fg_svm_table3, scores.class1 = bg_svm_table3, curve = T)

# Calculate the accuracy,	precision,	recall,	F1 score,	AUC ROC, AUC PR using the naive bayes model
# Create confusion matrix
confusion_matrix_nb_table3<-table(test$nb.pred,test$test.target)
## Calculate true positive and true negative rate
TP_nb_table3 <- confusion_matrix_nb_table3[2,2]
TN_nb_table3 <- confusion_matrix_nb_table3[1,1]
## Calculate false positive and true negative rate
FP_nb_table3 <- confusion_matrix_nb_table3[1,2]
FN_nb_table3 <- confusion_matrix_nb_table3[2,1]
## Calculate accuracy
accuracy_nb_table3 <- (TP_nb_table3 + TN_nb_table3) / (TP_nb_table3 + TN_nb_table3 + FP_nb_table3 + FN_nb_table3)
## Calculate false negative rate
false_negative_rate_nb_table3 <- FN_nb_table3 / (TP_nb_table3 + TN_nb_table3 + FP_nb_table3 + FN_nb_table3)
## Calculate false positive rate
false_positive_rate_nb_table3 <- FP_nb_table3 / (TP_nb_table3 + TN_nb_table3 + FP_nb_table3 + FN_nb_table3)
## Calculate precision
precision_nb_table3 <- (TP_nb_table3) / (TP_nb_table3 + FP_nb_table3)
## Calculate recall
recall_nb_table3 <- (TP_nb_table3) / (TP_nb_table3 + FN_nb_table3)
# Calculate F1 score
F1_nb_table3 <- (2 * precision_nb_table3 * recall_nb_table3) / (precision_nb_table3 + recall_nb_table3)
# Calculate AUC ROC and AUC PR 	
fg_nb_table3 <- test$nb.pred[test$test.target == 1]
bg_nb_table3 <- test$nb.pred[test$test.target == 0]
auc_roc_nb_table3 <- roc.curve(scores.class0 = fg_nb_table3, scores.class1 = bg_nb_table3, curve = T)
auc_pr_nb_table3 <- pr.curve(scores.class0 = fg_nb_table3, scores.class1 = bg_nb_table3, curve = T)

# Calculate the accuracy,	precision,	recall,	F1 score,	AUC ROC, AUC PR using the logistic regression model
# Create confusion matrix
confusion_matrix_logistic_table3<-table(test$logistic.pred,test$test.target)
## Calculate true positive and true negative rate
TP_logistic_table3 <- confusion_matrix_logistic_table3[2,2]
TN_logistic_table3 <- confusion_matrix_logistic_table3[1,1]
## Calculate false positive and true negative rate
FP_logistic_table3 <- confusion_matrix_logistic_table3[1,2]
FN_logistic_table3 <- confusion_matrix_logistic_table3[2,1]
## Calculate accuracy
accuracy_logistic_table3 <- (TP_logistic_table3 + TN_logistic_table3) / (TP_logistic_table3 + TN_logistic_table3 + FP_logistic_table3 + FN_logistic_table3)
## Calculate false negative rate
false_negative_rate_logistic_table3 <- FN_logistic_table3 / (TP_logistic_table3 + TN_logistic_table3 + FP_logistic_table3 + FN_logistic_table3)
## Calculate false positive rate
false_positive_rate_logistic_table3 <- FP_logistic_table3 / (TP_logistic_table3 + TN_logistic_table3 + FP_logistic_table3 + FN_logistic_table3)
## Calculate precision
precision_logistic_table3 <- (TP_logistic_table3) / (TP_logistic_table3 + FP_logistic_table3)
## Calculate recall
recall_logistic_table3 <- (TP_logistic_table3) / (TP_logistic_table3 + FN_logistic_table3)
# Calculate F1 score
F1_logistic_table3 <- (2 * precision_logistic_table3 * recall_logistic_table3) / (precision_logistic_table3 + recall_logistic_table3)
# Calculate AUC ROC and AUC PR 	
fg_logistic_table3 <- test$logistic.pred[test$test.target == 1]
bg_logistic_table3 <- test$logistic.pred[test$test.target == 0]
auc_roc_logistic_table3 <- roc.curve(scores.class0 = fg_logistic_table3, scores.class1 = bg_logistic_table3, curve = T)
auc_pr_logistic_table3 <- pr.curve(scores.class0 = fg_logistic_table3, scores.class1 = bg_logistic_table3, curve = T)

# Calculate the accuracy,	precision,	recall,	F1 score,	AUC ROC, AUC PR using the majority vote model
# Create confusion matrix
confusion_matrix_majority_table3<-table(test$majority.pred,test$test.target)
## Calculate true positive and true negative rate
TP_majority_table3 <- confusion_matrix_majority_table3[2,2]
TN_majority_table3 <- confusion_matrix_majority_table3[1,1]
## Calculate false positive and true negative rate
FP_majority_table3 <- confusion_matrix_majority_table3[1,2]
FN_majority_table3 <- confusion_matrix_majority_table3[2,1]
## Calculate accuracy
accuracy_majority_table3 <- (TP_majority_table3 + TN_majority_table3) / (TP_majority_table3 + TN_majority_table3 + FP_majority_table3 + FN_majority_table3)
## Calculate false negative rate
false_negative_rate_majority_table3 <- FN_majority_table3 / (TP_majority_table3 + TN_majority_table3 + FP_majority_table3 + FN_majority_table3)
## Calculate false positive rate
false_positive_rate_majority_table3 <- FP_majority_table3 / (TP_majority_table3 + TN_majority_table3 + FP_majority_table3 + FN_majority_table3)
## Calculate precision
precision_majority_table3 <- (TP_majority_table3) / (TP_majority_table3 + FP_majority_table3)
## Calculate recall
recall_majority_table3 <- (TP_majority_table3) / (TP_majority_table3 + FN_majority_table3)
# Calculate F1 score
F1_majority_table3 <- (2 * precision_majority_table3 * recall_majority_table3) / (precision_majority_table3 + recall_majority_table3)
# Calculate AUC ROC and AUC PR 	
fg_majority_table3 <- test$majority.pred[test$test.target == 1]
bg_majority_table3 <- test$majority.pred[test$test.target == 0]
auc_roc_majority_table3 <- roc.curve(scores.class0 = fg_majority_table3, scores.class1 = bg_majority_table3, curve = T)
auc_pr_majority_table3 <- pr.curve(scores.class0 = fg_majority_table3, scores.class1 = bg_majority_table3, curve = T)

# Create Table 3
svm_table3<-cbind(accuracy_svm_table3,precision_svm_table3,recall_svm_table3, F1_svm_table3, auc_roc_svm_table3$auc, auc_pr_svm_table3$auc.integral)
nb_table3<-cbind(accuracy_nb_table3,precision_nb_table3,recall_nb_table3, F1_nb_table3, auc_roc_nb_table3$auc, auc_pr_nb_table3$auc.integral)
logistic_table3<-cbind(accuracy_logistic_table3,precision_logistic_table3,recall_logistic_table3, F1_logistic_table3, auc_roc_logistic_table3$auc, auc_pr_logistic_table3$auc.integral)
majority_table3<-cbind(accuracy_majority_table3,precision_majority_table3,recall_majority_table3, F1_majority_table3, auc_roc_majority_table3$auc, auc_pr_majority_table3$auc.integral)
table3<-as.data.frame(rbind(svm_table3,nb_table3,logistic_table3,majority_table3))
colnames(table3) <- c("Accuracy",	"Precision",	"Recall",	"F1 Score", "AUC ROC", "AUC PR")
rownames(table3) <- c("Support Vector Machine", "Naive Bayes", "Logistic Regression", "Majority Vote")
round(table3, 3)

################################################################################
# Table 4: Top 20 keywords with the greatest frequency in the allegation data
################################################################################

# Subset allegation data
allegs<-snarp_allegation_data[snarp_allegation_data$alleg.dummy==1,]

# Create document term matrix
allegs_corp <- Corpus(VectorSource(allegs$train.dict.text))
allegs_dtm <- TermDocumentMatrix(allegs_corp)

# Calculate top 20 terms
allegs_mtx <- as.matrix(allegs_dtm)
allegs_freq <- sort(rowSums(allegs_mtx),decreasing=TRUE)
table4 <- data.frame(Frequency=allegs_freq)

# Create Table 4
head(table4, 20)

################################################################################################################################################################################################################################################

#####################################################################################
# Figure 1: Number of physical integrity rights allegations over time by organization
#####################################################################################

reports.year <- read.table(text = "1999  2000  2001   2002   2003   2004   2005   2006    2007   2008   2009   2010   2011   2012  2013  2014  2015  2016
                            H 804  821 1029  897    0  463  559  641  717  717  757  798  980  941  962  991 1002  991
                            A 1841 2074 1915 1902 1768 1639 1482 1549 1539 1539 1568 1673 1768 1748    0 1606 1578 1473
                            S 6976 8749 9298 7654 6654 6302 6385 7294 6538 6538 7202 7166 5555 5909 5636 5750 5493 5557", header = TRUE,check.names = FALSE)

# PLOT
par(mar=c(6, 5, 5, 9), xpd=TRUE)
reports<-barplot(as.matrix(reports.year),
                 col = c("darkslategrey", "darkcyan", "mediumseagreen"),
                 legend.text=c("Human Rights Watch", "Amnesty International", "US State Department"), xlab="Year", ylab="Number of Allegations", font.main = 4,args.legend = list(x = "topright", bty = "n", inset=c(-0.25, 0), title="Organization"))

par(mar=c(5,5.5,2,.5), mfrow=c(1,3), cex.lab=1.25, font=2)
barplot(as.matrix(reports.year[1,]), space=0, las=2, col="darkslategrey", main="Human Rights Watch", ylim=c(0,10000))
mtext(side=2, "Number of Allegations", line=4.0, cex.lab=1.25)
barplot(as.matrix(reports.year[2,]), space=0, las=2, col="darkcyan", main="Amnesty International", ylim=c(0,10000))
barplot(as.matrix(reports.year[3,]), space=0, las=2, col="mediumseagreen", main="US State Department", xlab="", ylim=c(0,10000))

############################################
# Figure 2: Number of allegations by country
############################################

COLORS <- c("white", "#fee5d9", "#fcbba1", "#fc9272", "#fb6a4a", "#ef3b2c", "#cb181d", "#99000d", "black")
par(mfrow=c(2,2))

# Amnesty International Reports
tab <- xtabs( ~ iso3c + source.short, data=allegs)
ai <- data.frame(tab[,1])
names(ai) <- "count"
ai$region <- countrycode(row.names(ai), origin = "iso3c", destination = "country.name")
ai$region[ai$region=="Côte d’Ivoire"] <- "Ivory Coast"
ai$region[ai$region=="Congo - Kinshasa"] <- "Democratic Republic of the Congo"
ai$region[ai$region=="Congo - Brazzaville"] <- "Republic of Congo"
ai$region[ai$region=="United Kingdom"] <- "UK"
ai$region[ai$region=="United States"] <- "USA"

# PLOT
map.world <- map_data("world")
ggplot(ai, aes(map_id = region)) +
  geom_map(aes(fill = count), map = map.world) +
  ggtitle(paste("Allegations from Amnesty International Reports, 1999-2016", sep="")) +
  xlab("Latitude") + ylab("Longitude") +
  coord_map("rectangular", lat0=0, xlim=c(-180,180), ylim=c(-60, 90)) +
  expand_limits(x = map.world$long, y = map.world$lat) + scale_fill_gradientn(colours=COLORS, na.value=grey(.875), guide = "colourbar")

# Human Rights Watch Reports
hrw <- data.frame(tab[,2])
names(hrw) <- "count"
hrw$region <- countrycode(row.names(hrw), origin = "iso3c", destination = "country.name")
hrw$region[hrw$region=="Côte d’Ivoire"] <- "Ivory Coast"
hrw$region[hrw$region=="Congo - Kinshasa"] <- "Democratic Republic of the Congo"
hrw$region[hrw$region=="Congo - Brazzaville"] <- "Republic of Congo"
hrw$region[hrw$region=="United Kingdom"] <- "UK"
hrw$region[hrw$region=="United States"] <- "USA"

#PLOT
ggplot(hrw, aes(map_id = region)) +
    geom_map(aes(fill = count), map = map.world) +
    ggtitle(paste("Allegations from Human Rights Watch Reports, 1999-2016", sep="")) +
    xlab("Latitude") + ylab("Longitude") +
    coord_map("rectangular", lat0=0, xlim=c(-180,180), ylim=c(-60, 90)) +
    expand_limits(x = map.world$long, y = map.world$lat) + scale_fill_gradientn(colours=COLORS, na.value=grey(.875), guide = "colourbar")

# US State Department Reports
sd <- data.frame(tab[,3])
names(sd) <- "count"
sd$region <- countrycode(row.names(sd), origin = "iso3c", destination = "country.name")
sd$region[sd$region=="Côte d’Ivoire"] <- "Ivory Coast"
sd$region[sd$region=="Congo - Kinshasa"] <- "Democratic Republic of the Congo"
sd$region[sd$region=="Congo - Brazzaville"] <- "Republic of Congo"
sd$region[sd$region=="United Kingdom"] <- "UK"
sd$region[sd$region=="United States"] <- "USA"

#PLOT
ggplot(sd, aes(map_id = region)) +
  geom_map(aes(fill = count), map = map.world) +
  ggtitle(paste("Allegations from US State Department Reports, 1999-2016", sep="")) +
  xlab("Latitude") + ylab("Longitude") +
  coord_map("rectangular", lat0=0, xlim=c(-180,180), ylim=c(-60, 90)) +
  expand_limits(x = map.world$long, y = map.world$lat) + scale_fill_gradientn(colours=COLORS, na.value=grey(.875), guide = "colourbar")

################################################################################################################################################################################################################################################

#######################################################################
# APPENDIX B Physical integrity rights keywords using the training data
#######################################################################

# Produce list of unique terms
allegs_terms<-rownames(allegs_mtx)
appendixb<-allegs_terms[order(allegs_terms)]
appendixb_end<-c("","","")
appendixb<-append(appendixb,appendixb_end)
appendixb <- lapply(list, function(x) data.frame(matrix(appendixb, ncol = 5, byrow = TRUE)))

#############################################################################################################################
# APPENDIX C Example of how our automated method codes sentences in our training data as physical integrity rights allegation
#############################################################################################################################

# Subset sentences
sentences<-snarp_allegation_data[snarp_allegation_data$sentence.id=="17200" |
                                snarp_allegation_data$sentence.id=="1458438" |
                                snarp_allegation_data$sentence.id=="1207060" |
                                snarp_allegation_data$sentence.id=="646139" |
                                snarp_allegation_data$sentence.id=="1559735" |
                                snarp_allegation_data$sentence.id=="209468",]
sentences<-sentences[, c("country", "year", "source.short", "sentence.id", "original.text", "train.dict.common.text","alleg.prob", "alleg.dummy")]
setnames(sentences, old = c("source.short", "original.text", "alleg.prob", "alleg.dummy"), new = c("source", "sentence", "pr.allegation", "final.allegation.dataset"))
sentences$final.allegation.dataset<-ifelse(sentences$final.allegation.dataset=='1','Yes','No')
sentences$source<-ifelse(sentences$source=='A','Amnesty International', ifelse(sentences$source=='H','Human Rights Watch', 'US State Department'))
sentences<-sentences[order(-rank(sentences$final.allegation.dataset), -rank(sentences$pr.allegation), sentences$country, sentences$year, decreasing = FALSE), ]
sentences$pr.allegation<-round(sentences$pr.allegation, 2)

# Calculate frequency of common terms and key words
# Create document term matrix
sentences_corp <- Corpus(VectorSource(sentences$train.dict.common.text))
sentences_dtm <- DocumentTermMatrix(sentences_corp)
sentences_mtx<-as.matrix(sentences_dtm)
sentences_dta<-as.data.frame(sentences_mtx)
sentences_dta[sentences_dta==0]<-NA
sentence_17200<-sentences_dta[1,]
sentence_1458438<-sentences_dta[2,]
sentence_1207060<-sentences_dta[3,]
sentence_646139<-sentences_dta[4,]
sentence_1559735<-sentences_dta[5,]
sentence_209468<-sentences_dta[6,]
sentence_17200<-sentence_17200[, colSums(is.na(sentence_17200)) != nrow(sentence_17200)]
sentence_1458438<-sentence_1458438[, colSums(is.na(sentence_1458438)) != nrow(sentence_1458438)]
sentence_1207060<-sentence_1207060[, colSums(is.na(sentence_1207060)) != nrow(sentence_1207060)]
sentence_646139<-sentence_646139[, colSums(is.na(sentence_646139)) != nrow(sentence_646139)]
sentence_1559735<-sentence_1559735[, colSums(is.na(sentence_1559735)) != nrow(sentence_1559735)]
sentence_209468<-sentence_209468[, colSums(is.na(sentence_209468)) != nrow(sentence_209468)]

# Create table APPENDIX C
sentence_17200<-sentence_17200[, order(names(sentence_17200))]
sentence_1458438<-sentence_1458438[, order(names(sentence_1458438))]
sentence_1207060<-sentence_1207060[, order(names(sentence_1207060))]
sentence_646139<-sentence_646139[, order(names(sentence_646139))]
sentence_1559735<-sentence_1559735[, order(names(sentence_1559735))]
sentence_209468<-sentence_209468[, order(names(sentence_209468))]
sentence_17200<-data.frame(freq=unlist(sentence_17200))
sentence_1458438<-data.frame(freq=unlist(sentence_1458438))
sentence_1207060<-data.frame(freq=unlist(sentence_1207060))
sentence_646139<-data.frame(freq=unlist(sentence_646139))
sentence_1559735<-data.frame(freq=unlist(sentence_1559735))
sentence_209468<-data.frame(freq=unlist(sentence_209468))
sentence_17200$term <- rownames(sentence_17200)
sentence_1458438$term <- rownames(sentence_1458438)
sentence_1207060$term <- rownames(sentence_1207060)
sentence_646139$term <- rownames(sentence_646139)
sentence_1559735$term <- rownames(sentence_1559735)
sentence_209468$term <- rownames(sentence_209468)
sentence_17200_dtm <- paste(sentence_17200$term,sentence_17200$freq)
sentence_1458438_dtm <- paste(sentence_1458438$term,sentence_1458438$freq)
sentence_1207060_dtm <- paste(sentence_1207060$term,sentence_1207060$freq)
sentence_646139_dtm <- paste(sentence_646139$term,sentence_646139$freq)
sentence_1559735_dtm <- paste(sentence_1559735$term,sentence_1559735$freq)
sentence_209468_dtm <- paste(sentence_209468$term,sentence_209468$freq)
sentence_17200_dtm<-paste(sentence_17200_dtm, sep=" ", collapse=" ") 
sentence_1458438_dtm<-paste(sentence_1458438_dtm, sep=" ", collapse=" ") 
sentence_1207060_dtm<-paste(sentence_1207060_dtm, sep=" ", collapse=" ") 
sentence_646139_dtm<-paste(sentence_646139_dtm, sep=" ", collapse=" ") 
sentence_1559735_dtm<-paste(sentence_1559735_dtm, sep=" ", collapse=" ") 
sentence_209468_dtm<-paste(sentence_209468_dtm, sep=" ", collapse=" ") 
sentences_all_dtm<-as.data.frame(rbind(sentence_17200_dtm,sentence_1458438_dtm,sentence_1207060_dtm,sentence_646139_dtm,sentence_1559735_dtm,sentence_209468_dtm))
sentences$document.term.matrix<-sentences_all_dtm$V1
appendixc<-sentences[c(1:5,9,7,8)]
View(appendixc)

####################################################
# APPENDIX D Allegations from All Reports, 1999-2016
####################################################

# Human Rights Watch Reports
all <- data.frame(tab)
all<-all %>% 
  group_by(iso3c) %>% 
  summarise(count = sum(Freq))
all$region <- countrycode(all$iso3c, origin = "iso3c", destination = "country.name")
all$region[all$region=="Côte d’Ivoire"] <- "Ivory Coast"
all$region[all$region=="Congo - Kinshasa"] <- "Democratic Republic of the Congo"
all$region[all$region=="Congo - Brazzaville"] <- "Republic of Congo"
all$region[all$region=="United Kingdom"] <- "UK"
all$region[all$region=="United States"] <- "USA"

#PLOT
ggplot(all, aes(map_id = region)) +
  geom_map(aes(fill = count), map = map.world) +
  ggtitle(paste("Allegations from Human Rights Watch Reports, 1999-2016", sep="")) +
  xlab("Latitude") + ylab("Longitude") +
  coord_map("rectangular", lat0=0, xlim=c(-180,180), ylim=c(-60, 90)) +
  expand_limits(x = map.world$long, y = map.world$lat) + scale_fill_gradientn(colours=COLORS, na.value=grey(.875), guide = "colourbar")

###########################################################################################
# APPENDIX E.1 Correspondence Between the Political Terror Scale and Allegation Sentences
#
# and
#
# APPENDIX E.2 Correspondence Between Latent Human Rights Scores and Allegation Sentences
##########################################################################################

tab_yr <- xtabs(  ~ iso3c + year + source.short, data=allegs)
alleg_yr <- data.frame(tab_yr[,,1])
names(alleg_yr) <- c("iso3c", "year", "AI_count")
alleg_yr$HRW_count <- data.frame(tab_yr[,,2])$Freq
alleg_yr$SD_count <- data.frame(tab_yr[,,3])$Freq
alleg_yr$CCODE <- countrycode(alleg_yr$iso3c, origin = "iso3c", destination = "cown")

# Merge human rights protection scores data to allegation data
hr_data <- readRDS("HumanRightsProtectionScores_v4.01.Rdata")
alleg_yr <- merge(alleg_yr, hr_data, by.x=c("CCODE", "year"), by.y=c("COW", "YEAR"))

# PLOT Figure E.1, first row
par(mfrow=c(1,3), mar=c(4,4,2,.5))
boxplot(alleg_yr$AI_count ~ alleg_yr$Amnesty, horizontal=TRUE, main="Amnesty International", ylab="PTS Amnesty International", xlab="Allegation Sentences", cex.lab=1.25)
boxplot(alleg_yr$HRW_count ~ alleg_yr$HRW, horizontal=TRUE, main="Human Rights Watch", ylab="PTS HRW", xlab="Allegation Sentences", cex.lab=1.25)
boxplot(alleg_yr$SD_count ~ alleg_yr$State, horizontal=TRUE, main="US State Department", ylab="PTS State", xlab="Allegation Sentences", cex.lab=1.25)

# PLOT Figure E.1, second row
par(mfrow=c(1,3), mar=c(4,4,2,.5))
boxplot(log10(alleg_yr$AI_count+1) ~ alleg_yr$Amnesty, horizontal=TRUE, main="Amnesty International", ylab="PTS Amnesty International", xlab="Allegation Sentences", xaxt="n", cex.lab=1.25)
axis(side=1, at=c(0,.5,1,1.5,2,2.5), labels=c(expression(10^1),expression(10^1.5),expression(10^2),expression(10^2.5),expression(10^3),expression(10^3.5)), cex.axis=1.25)
boxplot(log10(alleg_yr$HRW_count+1) ~ alleg_yr$HRW, horizontal=TRUE, main="Human Rights Watch", ylab="PTS HRW", xlab="Allegation Sentences", xaxt="n", cex.lab=1.25)
axis(side=1, at=c(0,.5,1,1.5,2,2.5), labels=c(expression(10^1),expression(10^1.5),expression(10^2),expression(10^2.5),expression(10^3),expression(10^3.5)), cex.axis=1.25)
boxplot(log10(alleg_yr$SD_count+1) ~ alleg_yr$State, horizontal=TRUE, main="US State Department", ylab="PTS State", xlab="Allegation Sentences", xaxt="n", ylim=c(0,2.5), cex.lab=1.25)
axis(side=1, at=c(0,.5,1,1.5,2,2.5), labels=c(expression(10^1),expression(10^1.5),expression(10^2),expression(10^2.5),expression(10^3),expression(10^3.5)), cex.axis=1.25)

# PLOT Figure E,2, first row
par(mfrow=c(1,3), mar=c(4,4,2,.5))
plot(alleg_yr$theta_mean ~ alleg_yr$AI_count, main="Amnesty International", ylab="Latent Human Rights Scores", xlab="Allegation Sentences", cex.lab=1.25, col=grey(.75))
lines(lowess(y=alleg_yr$theta_mean, x=alleg_yr$AI_count), col=2, lwd=2)
plot(alleg_yr$theta_mean ~ alleg_yr$HRW_count, main="Human Rights Watch", ylab="Latent Human Rights Scores", xlab="Allegation Sentences", cex.lab=1.25, col=grey(.75))
lines(lowess(y=alleg_yr$theta_mean, x=alleg_yr$HRW_count), col=2, lwd=2)
plot(alleg_yr$theta_mean ~ alleg_yr$SD_count, main="US State Department", ylab="Latent Human Rights Scores", xlab="Allegation Sentences",  cex.lab=1.25, col=grey(.75))
lines(lowess(y=alleg_yr$theta_mean, x=alleg_yr$SD_count), col=2, lwd=2)

# PLOT Figure E,2, second row
par(mfrow=c(1,3), mar=c(4,4,2,.5))
plot(alleg_yr$theta_mean ~ log10(alleg_yr$AI_count+1), main="Amnesty International", ylab="Latent Human Rights Scores", xlab="Allegation Sentences", xaxt="n", cex.lab=1.25, col=grey(.75))
lines(lowess(y=alleg_yr$theta_mean, x=log10(alleg_yr$AI_count+1)), col=2, lwd=2)
axis(side=1, at=c(0,.5,1,1.5,2,2.5), labels=c(expression(10^1),expression(10^1.5),expression(10^2),expression(10^2.5),expression(10^3),expression(10^3.5)), cex.axis=1.25)
plot(alleg_yr$theta_mean ~ log10(alleg_yr$HRW_count+1), main="Human Rights Watch", ylab="Latent Human Rights Scores", xlab="Allegation Sentences", xaxt="n", cex.lab=1.25, col=grey(.75))
lines(lowess(y=alleg_yr$theta_mean, x=log10(alleg_yr$HRW_count+1)), col=2, lwd=2)
axis(side=1, at=c(0,.5,1,1.5,2,2.5), labels=c(expression(10^1),expression(10^1.5),expression(10^2),expression(10^2.5),expression(10^3),expression(10^3.5)), cex.axis=1.25)
plot(alleg_yr$theta_mean ~ log10(alleg_yr$SD_count+1), main="US State Department", ylab="Latent Human Rights Scores", xlab="Allegation Sentences", xaxt="n", xlim=c(0,2.5), cex.lab=1.25, col=grey(.75))
lines(lowess(y=alleg_yr$theta_mean, x=log10(alleg_yr$SD_count+1)), col=2, lwd=2)
axis(side=1, at=c(0,.5,1,1.5,2,2.5), labels=c(expression(10^1),expression(10^1.5),expression(10^2),expression(10^2.5),expression(10^3),expression(10^3.5)), cex.axis=1.25)

# Spearman rank-order correlations for each variable
cor<-round(c(cor(alleg_yr$Amnesty, alleg_yr$AI_count, method="spearman", use="pairwise"),
             cor(alleg_yr$HRW, alleg_yr$HRW_count, method="spearman", use="pairwise"),
             cor(alleg_yr$State, alleg_yr$SD_count, method="spearman", use="pairwise"),
             cor(alleg_yr$theta_mean, alleg_yr$AI_count, method="spearman", use="pairwise"),
             cor(alleg_yr$theta_mean, alleg_yr$HRW_count, method="spearman", use="pairwise"),
             cor(alleg_yr$theta_mean, alleg_yr$SD_count, method="spearman", use="pairwise")),digits=3)

# Figure E.1 note: "The Spearman rank-order correlations for each variable are 0.733, 0.630, and 0.840, respectively."
cor[1:3]

# Figure E.2 note: "The Spearman rank-order correlations for each variable are -0.733, -0.626, and -0.883, respectively."
cor[4:6]

####################################################################################################
# APPENDIX E.3 Correspondence Between Country-Year Political Terror Scale Variables and Top 20 Terms
#
# and
#
# APPENDIX E.4 Correspondence Between Country-Year Latent Human Rights Scores Variable and Top 20 Terms
####################################################################################################

text_vector <- as.character(snarp_allegation_data$stemmed.text)
text_list <- strsplit(text_vector, split=" ")

top20_terms <- c("kill", "arrest", "tortur", "forc", "detent", "prison", "secur", "arbitrari", "alleg", "beat", "detain", "humanright", "abus", "charg", "attack", "death", "disappear", "suspect", "sever", "treatment")

value <- list()

for(j in 1:20){
    data <- alleg_yr
    
    test_vector <- lapply(1:length(text_list), function(i){
        sum(as.numeric(grepl(top20_terms[j], text_list[[i]])))
    })
    
    snarp_allegation_data$test_vector <- unlist(test_vector)
    
    tab <- xtabs(test_vector ~ iso3c + year + source.short, data=snarp_allegation_data)
    sum(tab)
    
    test_data <- data.frame(tab[,,1])
    head(test_data)
    
    names(test_data) <- c("iso3c", "year", "AI_test")
    
    test_data$HRW_test <- data.frame(tab[,,2])$Freq
    test_data$SD_test <- data.frame(tab[,,3])$Freq
    head(test_data)
    
    test_data$CCODE <- countrycode(test_data$iso3c, origin = "iso3c", destination = "cown")
    head(test_data)
    
    data <- merge(data, test_data, by.x=c("CCODE", "year"), by.y=c("CCODE", "year"))
    head(data)

    value[[j]] <- c(cor(data$AI_test, data$Amnesty, method="spearman", use="pairwise"),
                    cor(data$HRW_test, data$HRW, method="spearman", use="pairwise"),
                    cor(data$SD_test, data$State, method="spearman", use="pairwise"),
                    cor(data$AI_test, data$theta_mean, method="spearman", use="pairwise"),
                    cor(data$HRW_test, data$theta_mean, method="spearman", use="pairwise"),
                    cor(data$SD_test, data$theta_mean, method="spearman", use="pairwise"))

}

value_mat <- do.call("rbind", value)

# Create Table E.3 
pts_cor<-value_mat[,1:3]
pts_cor<-round(pts_cor, digits=3)
pts_cor<-cbind(top20_terms,pts_cor)
pts_cor

# Create Table E.4
hrs_cor<-value_mat[,4:6]
hrs_cor<-round(hrs_cor, digits=3)
hrs_cor<-cbind(top20_terms,hrs_cor)
hrs_cor
